/*
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

#ifndef HCCL_OP_RETRY_PUB_H
#define HCCL_OP_RETRY_PUB_H

#include <functional>
#include <map>
#include <string>
#include <vector>
#include "hccl/base.h"
#include "hccl_common.h"
#include "stream_pub.h"
#include "hccl_socket.h"
#include "hdc_pub.h"
#include "notify_pool.h"
#include "aicpu_operator_pub.h"

namespace hccl {

// Element count (bytes) of the RetryInfo::dfxIpInfo character buffer.
constexpr u32 OPRETRY_DFX_IPINFO_LENGTH = 256U;
// NOTE(review): the constants below are not referenced in this header; judging by
// their names they select the default / backup remote connection and bound the
// number of active link switches — confirm against the call sites.
constexpr u32 CONNECT_REMOTE_DEFAULT = 1U;
constexpr u32 CONNECT_REMOTE_BACKUP = 2U;
constexpr u32 ACTIVE_SWITCH_TIMES = 2U;

// States of the op-retry state machines.
// Enumerators are numbered implicitly from 0 in declaration order, so the order
// below must not change: server states first, then agent states, then the
// RETRY_STATE_RESERVED sentinel.
enum RetryState {
    // ---- Server-side states ----
    RETRY_STATE_SERVER_RUNNING = 0,
    RETRY_STATE_CMD_CHECK_LINK,
    RETRY_STATE_WAIT_LINK_CHECKED,
    RETRY_STATE_CHECK_ALL_LINK,
    RETRY_STATE_CMD_CHANGE_LINK,
    RETRY_STATE_WAIT_LINK_CHANGED,
    // NOTE(review): "STETA" looks like a typo for "STATE"; the identifier is part
    // of the public interface, so it is kept as-is.
    RETRY_STETA_HANDLE_ALL_ERR,
    RETRY_STATE_CMD_STOP_AICPU,     // send the StopAicpu command
    RETRY_STATE_WAIT_AICPU_STOPED,  // wait for the AICPU to stop
    RETRY_STATE_CMD_STOP_STREAM,
    RETRY_STATE_WAIT_STREAM_STOPED,
    RETRY_STATE_CMD_CLEAR_STREAM,
    RETRY_STATE_WAIT_STREAM_CLEARED,
    RETRY_STATE_CMD_STOP_TRANSPORT,
    RETRY_STATE_WAIT_STOP_TRANSPORT,
    RETRY_STATE_CMD_RESET_NOTIFY,
    RETRY_STATE_WAIT_NOTIFY_RESETED,
    RETRY_STATE_CMD_RESUME_TRANSPORT,
    RETRY_STATE_WAIT_RESUME_TRANSPORT,
    RETRY_STATE_CMD_CHECK,
    RETRY_STATE_WAIT_CHECK_INFO,
    RETRY_STATE_CHECK_OP,
    RETRY_STATE_CMD_CAN_RETRY,
    RETRY_STATE_WAIT_CAN_RETRY,
    RETRY_STATE_SERVER_RETRY_FAIL,
    RETRY_STATE_CMD_PLAN_SWITCH_NIC,
    RETRY_STATE_SERVER_WAIT_RESUME,
    RETRY_RESUME_STATE_SERVER_CHECK_LINK,
    RETRY_RESUME_STATE_SERVER_CHANGE_LINK,

    // ---- Agent-side states ----
    RETRY_STATE_AGENT_RUNNING,
    RETRY_STATE_RESP_AICPU_ERR,
    RETRY_STATE_RESP_LINK_CHECKED,
    RETRY_STATE_WAIT_CMD_STOP_AICPU,
    RETRY_STATE_POLL_AICPU_STOPED,
    RETRY_STATE_RESP_AICPU_STOPED,
    RETRY_STATE_WAIT_CMD_STOP_STREAM,
    RETRY_STATE_POLL_STREAM_STOPED,
    RETRY_STATE_RESP_STREAM_STOPED,
    RETRY_STATE_WAIT_CMD_CLEAR_STREAM,
    RETRY_STATE_RESP_STREAM_CLEARED,
    RETRY_STATE_RESP_LINK_CHANGED,
    RETRY_STATE_WAIT_CHANGE_LINK_INFO,
    RETRY_STATE_POLL_AICPU_CHANGED,
    RETRY_STATE_WAIT_CMD_STOP_TRANSPORT,
    RETRY_STATE_RESP_STOP_TRANSPORT,
    RETRY_STATE_WAIT_CMD_RESET_NOTIFY,
    RETRY_STATE_RESP_NOTIFY_RESETED,
    RETRY_STATE_WAIT_CMD_CHECK_LINK,
    RETRY_STATE_WAIT_CMD_RESUME_TRANSPORT,
    RETRY_STATE_RESP_RESUME_TRANSPORT,
    RETRY_STATE_WAIT_CMD_CHECK,
    RETRY_STATE_RESP_CHECK_INFO,
    RETRY_STATE_WAIT_CMD_CAN_RETRY,
    RETRY_STATE_POLL_AICPU_RETRYEND,
    RETRY_STATE_RESP_AICPU_RETRYEND,
    RETRY_STATE_RESP_RUNNING_ERR,
    RETRY_STATE_WAIT_CMD_RETRY_FAIL,
    RETRY_STATE_AGENT_RETRY_FAIL,
    RETRY_STATE_SEND_SWITCH_INFO,
    RETRY_STATE_WAIT_CMD_SEND_AICPU,
    RETRY_STATE_AGENT_WAIT_RESUME,
    RETRY_RESUME_STATE_AGENT_CHECK_LINK,
    RETRY_RESUME_STATE_AGENT_CHANGE_LINK,

    // Sentinel; also the default value of RetryInfo::retryState.
    RETRY_STATE_RESERVED,
};

const std::map<RetryState, std::string> RETRY_STATE_STR_MAP {
    // server状态
    {RETRY_STATE_SERVER_RUNNING, "RETRY_STATE_SERVER_RUNNING"},
    {RETRY_STATE_CMD_CHECK_LINK, "RETRY_STATE_CMD_CHECK_LINK"},
    {RETRY_STATE_CMD_CHANGE_LINK, "RETRY_STATE_CMD_CHANGE_LINK"},
    {RETRY_STATE_WAIT_LINK_CHANGED, "RETRY_STATE_WAIT_LINK_CHANGED"},
    {RETRY_STATE_WAIT_LINK_CHECKED, "RETRY_STATE_WAIT_LINK_CHECKED"},
    {RETRY_STATE_CHECK_ALL_LINK, "RETRY_STATE_CHECK_ALL_LINK"},
    {RETRY_STETA_HANDLE_ALL_ERR, "RETRY_STETA_HANDLE_ALL_ERR"},
    {RETRY_STATE_CMD_STOP_AICPU, "RETRY_STATE_CMD_STOP_AICPU"},
    {RETRY_STATE_WAIT_AICPU_STOPED, "RETRY_STATE_WAIT_AICPU_STOPED"},
    {RETRY_STATE_CMD_STOP_STREAM, "RETRY_STATE_CMD_STOP_STREAM"},
    {RETRY_STATE_WAIT_STREAM_STOPED, "RETRY_STATE_WAIT_STREAM_STOPED"},
    {RETRY_STATE_CMD_CLEAR_STREAM, "RETRY_STATE_CMD_CLEAR_STREAM"},
    {RETRY_STATE_WAIT_STREAM_CLEARED, "RETRY_STATE_WAIT_STREAM_CLEARED"},
    {RETRY_STATE_CMD_RESET_NOTIFY, "RETRY_STATE_CMD_RESET_NOTIFY"},
    {RETRY_STATE_CMD_STOP_TRANSPORT, "RETRY_STATE_CMD_STOP_TRANSPORT"},
    {RETRY_STATE_WAIT_STOP_TRANSPORT, "RETRY_STATE_WAIT_STOP_TRANSPORT"},
    {RETRY_STATE_WAIT_NOTIFY_RESETED, "RETRY_STATE_WAIT_NOTIFY_RESETED"},
    {RETRY_STATE_WAIT_CMD_CHECK_LINK, "RETRY_STATE_WAIT_CMD_CHECK_LINK"},
    {RETRY_STATE_CMD_RESUME_TRANSPORT, "RETRY_STATE_CMD_RESUME_TRANSPORT"},
    {RETRY_STATE_WAIT_RESUME_TRANSPORT, "RETRY_STATE_WAIT_RESUME_TRANSPORT"},
    {RETRY_STATE_CMD_CHECK, "RETRY_STATE_CMD_CHECK"},
    {RETRY_STATE_WAIT_CHECK_INFO, "RETRY_STATE_WAIT_CHECK_INFO"},
    {RETRY_STATE_CHECK_OP, "RETRY_STATE_CHECK_OP"},
    {RETRY_STATE_CMD_CAN_RETRY, "RETRY_STATE_CMD_CAN_RETRY"},
    {RETRY_STATE_WAIT_CAN_RETRY, "RETRY_STATE_WAIT_CAN_RETRY"},
    {RETRY_STATE_SERVER_RETRY_FAIL, "RETRY_STATE_SERVER_RETRY_FAIL"},
    {RETRY_STATE_CMD_PLAN_SWITCH_NIC, "RETRY_STATE_CMD_PLAN_SWITCH_NIC"},
    {RETRY_STATE_SERVER_WAIT_RESUME, "RETRY_STATE_SERVER_WAIT_RESUME"},
    {RETRY_RESUME_STATE_SERVER_CHECK_LINK, "RETRY_RESUME_STATE_SERVER_CHECK_LINK"},
    {RETRY_RESUME_STATE_SERVER_CHANGE_LINK, "RETRY_RESUME_STATE_SERVER_CHANGE_LINK"},
    
    // agent状态
    {RETRY_STATE_AGENT_RUNNING, "RETRY_STATE_AGENT_RUNNING"},
    {RETRY_STATE_RESP_AICPU_ERR, "RETRY_STATE_RESP_AICPU_ERR"},
    {RETRY_STATE_RESP_LINK_CHECKED, "RETRY_STATE_RESP_LINK_CHECKED"},
    {RETRY_STATE_RESP_LINK_CHANGED, "RETRY_STATE_RESP_LINK_CHANGED"},
    {RETRY_STATE_POLL_AICPU_CHANGED, "RETRY_STATE_POLL_AICPU_CHANGED"},
    {RETRY_STATE_WAIT_CMD_STOP_AICPU, "RETRY_STATE_WAIT_CMD_STOP_AICPU"},
    {RETRY_STATE_POLL_AICPU_STOPED, "RETRY_STATE_POLL_AICPU_STOPED"},
    {RETRY_STATE_RESP_AICPU_STOPED, "RETRY_STATE_RESP_AICPU_STOPED"},
    {RETRY_STATE_WAIT_CMD_STOP_STREAM, "RETRY_STATE_WAIT_CMD_STOP_STREAM"},
    {RETRY_STATE_POLL_STREAM_STOPED, "RETRY_STATE_POLL_STREAM_STOPED"},
    {RETRY_STATE_RESP_STREAM_STOPED, "RETRY_STATE_RESP_STREAM_STOPED"},
    {RETRY_STATE_WAIT_CMD_CLEAR_STREAM, "RETRY_STATE_WAIT_CMD_CLEAR_STREAM"},
    {RETRY_STATE_WAIT_CHANGE_LINK_INFO, "RETRY_STATE_WAIT_CHANGE_LINK_INFO"},
    {RETRY_STATE_RESP_STREAM_CLEARED, "RETRY_STATE_RESP_STREAM_CLEARED"},
    {RETRY_STATE_WAIT_CMD_STOP_TRANSPORT, "RETRY_STATE_WAIT_CMD_STOP_TRANSPORT"},
    {RETRY_STATE_RESP_STOP_TRANSPORT, "RETRY_STATE_RESP_STOP_TRANSPORT"},
    {RETRY_STATE_WAIT_CMD_RESET_NOTIFY, "RETRY_STATE_WAIT_CMD_RESET_NOTIFY"},
    {RETRY_STATE_RESP_NOTIFY_RESETED, "RETRY_STATE_RESP_NOTIFY_RESETED"},
    {RETRY_STATE_WAIT_CMD_RESUME_TRANSPORT, "RETRY_STATE_WAIT_CMD_RESUME_TRANSPORT"},
    {RETRY_STATE_RESP_RESUME_TRANSPORT, "RETRY_STATE_RESP_RESUME_TRANSPORT"},
    {RETRY_STATE_WAIT_CMD_CHECK, "RETRY_STATE_WAIT_CMD_CHECK"},
    {RETRY_STATE_RESP_CHECK_INFO, "RETRY_STATE_RESP_CHECK_INFO"},
    {RETRY_STATE_WAIT_CMD_CAN_RETRY, "RETRY_STATE_WAIT_CMD_CAN_RETRY"},
    {RETRY_STATE_POLL_AICPU_RETRYEND, "RETRY_STATE_POLL_AICPU_RETRYEND"},
    {RETRY_STATE_RESP_AICPU_RETRYEND, "RETRY_STATE_RESP_AICPU_RETRYEND"},
    {RETRY_STATE_RESP_RUNNING_ERR, "RETRY_STATE_RESP_RUNNING_ERR"},
    {RETRY_STATE_WAIT_CMD_RETRY_FAIL, "RETRY_STATE_WAIT_CMD_RETRY_FAIL"},
    {RETRY_STATE_AGENT_RETRY_FAIL, "RETRY_STATE_AGENT_RETRY_FAIL"},
    {RETRY_STATE_SEND_SWITCH_INFO, "RETRY_STATE_SEND_SWITCH_INFO"},
    {RETRY_STATE_WAIT_CMD_SEND_AICPU, "RETRY_STATE_WAIT_CMD_SEND_AICPU"},
    {RETRY_STATE_AGENT_WAIT_RESUME, "RETRY_STATE_AGENT_WAIT_RESUME"},
    {RETRY_RESUME_STATE_AGENT_CHECK_LINK, "RETRY_RESUME_STATE_AGENT_CHECK_LINK"},
    {RETRY_RESUME_STATE_AGENT_CHANGE_LINK, "RETRY_RESUME_STATE_AGENT_CHANGE_LINK"},

    {RETRY_STATE_RESERVED, "RETRY_STATE_RESERVED"}
};

// Command codes carried in RetryCommandInfo::command.
// Enumerators are numbered implicitly from 0 in declaration order, so the order
// below must not change.
enum RetryCommand {
    RETRY_CMD_RUNNING = 0,  // normal operation
    RETRY_CMD_CHECK_LINK,
    RETRY_CMD_STOP_AICPU,
    RETRY_CMD_STOP_STREAM,
    RETRY_CMD_CLEAR_STREAM,
    RETRY_CMD_STOP_TRANSPORT,
    RETRY_CMD_RESET_NOTIFY,
    RETRY_CMD_RESUME_TRANSPORT,
    RETRY_CMD_CHECK_OPNAME,
    RETRY_CMD_CAN_RETRY,
    RETRY_CMD_RETRY_FAIL,
    RETRY_CMD_NOTIFY_SWITCH_SUC,
    RETRY_CMD_NOTIFY_SWITCH_FAIL,
    // Note: despite its name this is NOT the last enumerator; the values that
    // follow come after it, so it cannot be used as an element count.
    RETRY_CMD_RESERVED,
    // Retry constraints that currently have to be reported as a fault:
    // the in-place constraint and operator mismatch.
    RETRY_CMD_RETRY_CONSTRAINT_FAIL,
    RESUME_CMD_CHECK_LINK,
    RESUME_CMD_RUNNING,
};

// Printable name of every RetryCommand value, listed in enum declaration order.
const std::map<RetryCommand, std::string> RETRY_COMMAND_STR_MAP = {
    // Retry commands
    {RETRY_CMD_RUNNING,               "RETRY_CMD_RUNNING"},
    {RETRY_CMD_CHECK_LINK,            "RETRY_CMD_CHECK_LINK"},
    {RETRY_CMD_STOP_AICPU,            "RETRY_CMD_STOP_AICPU"},
    {RETRY_CMD_STOP_STREAM,           "RETRY_CMD_STOP_STREAM"},
    {RETRY_CMD_CLEAR_STREAM,          "RETRY_CMD_CLEAR_STREAM"},
    {RETRY_CMD_STOP_TRANSPORT,        "RETRY_CMD_STOP_TRANSPORT"},
    {RETRY_CMD_RESET_NOTIFY,          "RETRY_CMD_RESET_NOTIFY"},
    {RETRY_CMD_RESUME_TRANSPORT,      "RETRY_CMD_RESUME_TRANSPORT"},
    {RETRY_CMD_CHECK_OPNAME,          "RETRY_CMD_CHECK_OPNAME"},
    {RETRY_CMD_CAN_RETRY,             "RETRY_CMD_CAN_RETRY"},
    {RETRY_CMD_RETRY_FAIL,            "RETRY_CMD_RETRY_FAIL"},
    {RETRY_CMD_NOTIFY_SWITCH_SUC,     "RETRY_CMD_NOTIFY_SWITCH_SUC"},
    {RETRY_CMD_NOTIFY_SWITCH_FAIL,    "RETRY_CMD_NOTIFY_SWITCH_FAIL"},
    {RETRY_CMD_RESERVED,              "RETRY_CMD_RESERVED"},
    {RETRY_CMD_RETRY_CONSTRAINT_FAIL, "RETRY_CMD_RETRY_CONSTRAINT_FAIL"},

    // Resume commands
    {RESUME_CMD_CHECK_LINK,           "RESUME_CMD_CHECK_LINK"},
    {RESUME_CMD_RUNNING,              "RESUME_CMD_RUNNING"},
};

// For each "wait for response" state of the server state machine (key), the
// agent state that corresponds to it (value).
// Keys are listed in RetryState declaration order; std::map orders by key, so
// the listing order has no effect on the resulting container.
const std::map<RetryState, RetryState> RETRY_SERVER_WAIT_AGENT_STATE_LABEL {
    {RETRY_STATE_WAIT_LINK_CHECKED,     RETRY_STATE_RESP_LINK_CHECKED},
    {RETRY_STATE_WAIT_LINK_CHANGED,     RETRY_STATE_RESP_LINK_CHANGED},
    {RETRY_STATE_WAIT_AICPU_STOPED,     RETRY_STATE_RESP_AICPU_STOPED},
    {RETRY_STATE_WAIT_STREAM_STOPED,    RETRY_STATE_RESP_STREAM_STOPED},
    {RETRY_STATE_WAIT_STREAM_CLEARED,   RETRY_STATE_RESP_STREAM_CLEARED},
    {RETRY_STATE_WAIT_STOP_TRANSPORT,   RETRY_STATE_RESP_STOP_TRANSPORT},
    {RETRY_STATE_WAIT_NOTIFY_RESETED,   RETRY_STATE_RESP_NOTIFY_RESETED},
    {RETRY_STATE_WAIT_RESUME_TRANSPORT, RETRY_STATE_RESP_RESUME_TRANSPORT},
    {RETRY_STATE_WAIT_CHECK_INFO,       RETRY_STATE_RESP_CHECK_INFO},
    // The only pair whose names do not mirror each other.
    {RETRY_STATE_WAIT_CAN_RETRY,        RETRY_STATE_RESP_AICPU_RETRYEND},
};
// Addressing information for the op-retry server (root node).
// NOTE(review): hostPort and devId have no default member initializer, so they are
// left indeterminate unless the object is value-/aggregate-initialized by the caller.
struct OpRetryServerInfo {
    HcclIpAddress hostIP;   // host IP of the root node
    u32 hostPort;           // host port of the root node
    s32 devId;              // physical device ID (devicePhyId)
};
// Identity and network context of one op-retry agent (one rank).
// NOTE(review): userRank and deviceLogicId have no default member initializer, so they
// are left indeterminate unless the object is value-/aggregate-initialized by the caller.
struct OpRetryAgentInfo {
    u32 userRank;  // user rank within this group
    s32 deviceLogicId;
    HcclIpAddress hostIP;   // host IP of the current rank
    HcclIpAddress deviceIP; // device IP of the current rank
    HcclNetDevCtx netDevCtx;
    // presumably the network device context of the backup link — TODO confirm
    HcclNetDevCtx backUpNetDevCtx;
};
// Per-rank status record of the retry state machine.
// Every member except opInfo carries a default member initializer.
struct RetryInfo {
    u32 rankId{0};
    // Current state of the retry state machine.
    RetryState retryState{RETRY_STATE_RESERVED};
    // Reserved: link state.
    bool linkState{true};
    KfcExecStatus opInfo;
    // Whether the link is currently switched over ("borrowed") to another path.
    bool isChangeLinkFlag{false};
    // Diagnostic (DFX) information for the retry state machine (deviceIP + hostIP);
    // zero-filled by default.
    char dfxIpInfo[OPRETRY_DFX_IPINFO_LENGTH]{};
    // Report a fault for the retry operator-mismatch and in-place scenarios.
    bool isNeedReportOpRetryErr{false};
};
// A retry command paired with the identifier of an operator.
// NOTE(review): presumably opId names the operator the command applies to — confirm
// against the call sites. Neither member has a default initializer.
struct RetryCommandInfo{
    RetryCommand command;   // command code, one of the RetryCommand enumerators
    HcclOpIdentifier opId;
};

/* Types used by the retry agent state machine. */

// Streams grouped under a string key.
using HcclOpStreamRes = std::map<std::string, std::vector<Stream>>;

// Callback signatures. Parameter meanings are not visible in this header;
// see the code that installs and invokes each callback.
using OpRetryResetNotifyCallback = std::function<HcclResult(bool, s64)>;

using OpRetrySetTransportStatusCallback = std::function<HcclResult(
    const HcclOpIdentifier &,
    bool,
    const std::map<u32, bool> &,
    const std::map<u32, bool> &,
    bool)>;

using OpRetryGetSwitchRanksCallback = std::function<HcclResult(
    u32 *, bool*, u32 &, u8 *, u32 &, bool &, bool &)>;

using OpRetrySetTransportResumeStatusCallBack = std::function<HcclResult(
    const std::map<u32, bool> &,
    const std::map<u32, bool> &,
    bool,
    bool)>;
}
#endif